import requests
import pandas as pd
from io import StringIO
import time

# Scrape the listed-company profitability table from Sina Finance for one
# reporting period, page by page, and export the merged result to Excel.

# Request headers that make the client look like a desktop browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

# Reporting period to fetch.
year = 2022
quarter = 4

# First and last result pages to fetch (both inclusive).
page_start = 1
page_end = 3

# Seconds to wait for the server before a request is abandoned. Without a
# timeout, requests waits forever on a stalled connection.
request_timeout = 30

# Seconds to pause between consecutive page requests, to avoid tripping the
# site's anti-scraping protection.
page_delay = 3

# Fetch every page and collect one DataFrame per page.
data_list = []
for page in range(page_start, page_end + 1):

    # Pause only between requests: nothing to wait for before the first page
    # or after the last one.
    if page > page_start:
        time.sleep(page_delay)

    # Build the URL for this period and page.
    url = f'https://vip.stock.finance.sina.com.cn/q/go.php/vFinanceAnalyze/kind/profit/index.phtml?reportdate={year}&quarter={quarter}&p={page}'

    # Download the page; fail loudly on an HTTP error instead of trying to
    # parse an error page as data.
    response = requests.get(url=url, headers=headers, timeout=request_timeout)
    response.raise_for_status()
    # The response is decoded as GBK; set this before reading `.text`.
    response.encoding = 'gbk'
    html = response.text

    # Extract the table whose HTML id is "dataTable".
    table_list = pd.read_html(io=StringIO(html), attrs={'id': 'dataTable'})
    data = table_list[0]
    # Convert stock codes to strings left-padded with zeros to 6 characters,
    # so leading zeros dropped by numeric parsing are restored.
    data['股票代码'] = data['股票代码'].astype(dtype='string').str.zfill(width=6)
    data_list.append(data)

# Merge all pages into one table and export it.
df = pd.concat(objs=data_list, ignore_index=True)
# The year in the file name follows `year` so the two cannot drift apart.
# NOTE: the "年报" (annual report) wording corresponds to quarter = 4; adjust
# it by hand when exporting a different quarter.
df.to_excel(f'{year}年上市公司盈利能力年报.xlsx', index=False)
